# Computations
import numpy as np
import pandas as pd
# scipy
import scipy.stats as stats
# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn import metrics
from sklearn.feature_selection import RFE
# NOTE(review): sklearn.utils.fixes.loguniform was deprecated in scikit-learn 0.24
# and later removed; scipy.stats.loguniform is the replacement -- confirm the
# pinned scikit-learn version before running.
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# NOTE(review): duplicate of the RandomForestClassifier import two lines above.
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
# Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import missingno as msno
import plotly.offline as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re
# --- Notebook display & global plotting style (IPython magics + rcParams) ---
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
# plt setting
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib >= 3.6
# (renamed 'seaborn-v0_8-whitegrid') -- confirm the pinned matplotlib version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
# Silence all library warnings for cleaner notebook output (hides deprecations too).
import warnings
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for prediction.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
# Load the Pima Indians Diabetes dataset into a DataFrame and preview it.
Data = pd.read_csv('pima-indians-diabetes-database/diabetes.csv')
display(Data.head())
# Data.shape is a (rows, columns) tuple, unpacked into the two %i placeholders.
print('The Dataset Shape: %i rows and %i columns' % Data.shape)
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| BloodPressure | Diastolic blood pressure (mm Hg) |
| SkinThickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| DiabetesPedigreeFunction | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
def Data_info(Inp, Only_NaN = False):
    """Summarize each column of *Inp*: dtype, NaN count, and NaN percentage.

    Parameters
    ----------
    Inp : pd.DataFrame
        Frame to inspect.
    Only_NaN : bool, default False
        When True, keep only the columns containing at least one NaN.

    Returns
    -------
    pd.DataFrame indexed by column name, with columns
    ['Data Type', 'Number of NaN Values', 'Percentage'], sorted by dtype.
    """
    nan_counts = Inp.isnull().sum()
    summary = pd.DataFrame({'Data Type': Inp.dtypes,
                            'Number of NaN Values': nan_counts,
                            'Percentage': np.round(100 * (nan_counts / Inp.shape[0]), 2)})
    summary = summary.sort_values(by=['Data Type'])
    if Only_NaN:
        summary = summary[summary['Number of NaN Values'] > 0]
    return summary
# Transposed overview, keeping only the 'Data Type' and 'Number of NaN Values' rows.
display(Data_info(Data).T[:2])
# Visual completeness check per column (missingno bar chart).
_ = msno.bar(Data, figsize=(12,3), fontsize=14, log=False, color="#34495e")
# Standard summary statistics for the numeric features.
display(Data.describe())
Let's take a close look at our data.
# One distribution plot (step histogram + KDE + rug) per feature on a 4x2 grid;
# the last column (the Outcome target) is excluded.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
for i in range(len(Data.columns[:-1])):
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 -- confirm version.
    sns.distplot(Data.iloc[:,i], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax= ax[int(i/2),i%2])
    # Insert a space before inner capitals ('BloodPressure' -> 'Blood Pressure');
    # 'BMI' is skipped because the regex would split the acronym.
    if Data.iloc[:,i].name != 'BMI':
        ax[int(i/2),i%2].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", Data.iloc[:,i].name))
# Human-readable hover labels for each sample (Outcome 0/1 -> text).
Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Data['Outcome']]
# Lower-triangle scatterplot matrix of all eight features, points coloured by Outcome.
fig = go.Figure(data=go.Splom(dimensions=[dict(label='Pregnancies', values=Data['Pregnancies']),
                                          dict(label='Glucose', values=Data['Glucose']),
                                          dict(label='Blood<br>Pressure', values=Data['BloodPressure']),
                                          dict(label='Skin<br>Thickness', values=Data['SkinThickness']),
                                          dict(label='Insulin', values=Data['Insulin']),
                                          dict(label='BMI', values=Data['BMI']),
                                          dict(label='Diabetes<br>Pedigree<br>Fun', values=Data['DiabetesPedigreeFunction']),
                                          dict(label='Age', values=Data['Age'])],
                              showupperhalf=False,
                              marker=dict(color=Data['Outcome'], size=4, colorscale='Bluered',
                                          line=dict(width=0.4, color='black')),
                              text=Temp, diagonal=dict(visible=False)))
del Temp
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()
As can be seen, the features are roughly normally distributed, but some outlying entries need to be adjusted. To do so, we define a normalizer as follows: for a given vector $x$,
\begin{align*} \text{Normalizer}(x, cut) = \begin{cases} x_i &\mbox{if } |x_i- \mu|<\sigma\times cut \\ mode(x) & \mbox{else} \end{cases}. \end{align*}
def Normalizer(Col, cut = 3):
    """Keep only the entries of *Col* within ``cut`` standard deviations of the mean.

    Entries outside the band are dropped; they become NaN when the filtered
    Series is assigned back to a DataFrame column, and the caller then fills
    those NaNs with the mode, completing the piecewise rule shown above.
    """
    return Col[(Col > (Col.mean() - Col.std() * cut)) &
               (Col < (Col.mean() + Col.std() * cut))]
# Clip outliers feature-by-feature (in place, on Data) and re-plot the distributions.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
# Keep a pristine copy for the before/after comparison plotted below.
Temp = Data.copy()
for i in range(len(Data.columns[:-1])):
    # Drop entries beyond 3 standard deviations (they become NaN), then fill with the mode.
    Data[Data.columns[i]] = Normalizer(Data[Data.columns[i]])
    Data[Data.columns[i]] = Data[Data.columns[i]].fillna(Data[Data.columns[i]].dropna().mode()[0])
    # Sub-Plots
    sns.distplot(Data.iloc[:,i], rug=True, rug_kws={"color": "red"},
                 kde_kws={"color": "k", "lw": 2, "label": "KDE"},
                 hist_kws={"histtype": "step", "linewidth": 2,
                           "alpha": 1, "color": "Navy"}, ax= ax[int(i/2),i%2])
    # Same axis-label prettifying as before; 'BMI' kept intact.
    if Data.iloc[:,i].name != 'BMI':
        ax[int(i/2),i%2].set_xlabel(re.sub(r"(\w)([A-Z])", r"\1 \2", Data.iloc[:,i].name))
Basically, we diminished the influence of certain data points (see the following figure).
# Per-cell magnitude of the adjustment: |normalized - original| for every feature.
Temp0 = Temp.copy()
Temp0.iloc[:,:-1] = abs(Data.iloc[:,:-1] - Temp.iloc[:,:-1])
# Hover labels derived from the (unchanged) Outcome column.
Temp = ['Non-Diabetic' if x==0 else 'Diabetic' for x in Temp0['Outcome']]
# Scatterplot matrix of the adjustment magnitudes; non-zero points are the values
# that the normalization replaced.
fig = go.Figure(data=go.Splom(dimensions=[dict(label='Pregnancies', values=Temp0['Pregnancies']),
                                          dict(label='Glucose', values=Temp0['Glucose']),
                                          dict(label='Blood<br>Pressure', values=Temp0['BloodPressure']),
                                          dict(label='Skin<br>Thickness', values=Temp0['SkinThickness']),
                                          dict(label='Insulin', values=Temp0['Insulin']),
                                          dict(label='BMI', values=Temp0['BMI']),
                                          dict(label='Diabetes<br>Pedigree<br>Fun', values=Temp0['DiabetesPedigreeFunction']),
                                          dict(label='Age', values=Temp0['Age'])],
                              showupperhalf=False,
                              marker=dict(color=Temp0['Outcome'], size=4, colorscale='Bluered',
                                          line=dict(width=0.4, color='black')),
                              text=Temp, diagonal=dict(visible=False)))
del Temp, Temp0
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()
def Correlation_Plot (Df,Fig_Size):
    """Draw the lower triangle (plus diagonal) of Df's correlation matrix as a heatmap.

    Parameters
    ----------
    Df : pd.DataFrame
        Data whose pairwise correlations are shown.
    Fig_Size : float
        Width and height of the square figure, in inches.
    """
    Correlation_Matrix = Df.corr()
    # Boolean mask hiding the strict upper triangle; k=1 keeps the diagonal
    # visible (replaces the float mask plus a manual diagonal-clearing loop).
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True
    Fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    # NOTE(review): vmin=0 clips any negative correlation to the bottom colour --
    # confirm this is intended for this dataset.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("RdYlGn", n_colors=10), linewidths=0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .7})
Correlation_Plot (Data, 9)
# Feature variances, largest first (target column excluded).
Temp = Data.iloc[:,:-1].var().sort_values(ascending = False).to_frame(name= 'Variance')
display(Temp)
Temp0 = Data.corr()
# Correlations of the lowest-variance feature with all others; [:-1] drops the
# self-correlation (always 1, hence sorted last).
Temp0.loc[Temp.index[-1]].sort_values().to_frame(name= 'Correlation')[:-1].T
Even though the variance of Diabetes Pedigree Function is low, dropping it might not improve the performance of the model: the correlation of this feature with the rest of the features, and especially with the Outcome, is noticeable.
# Separate features from the binary target and make a 70/30 train/test split.
Target = 'Outcome'
X = Data.drop(columns = [Target])
y = Data[Target]
# random_state fixed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Quick sanity check of the four split shapes.
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                   'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Furthermore, we would like to standardize features by removing the mean and scaling to unit variance.
# Standardize features: fit on the training split only to avoid test-set leakage.
scaler = StandardScaler()
X_train_STD = scaler.fit_transform(X_train)
X_test_STD = scaler.transform(X_test)
# Wrap back into DataFrames so the column names survive the transform.
X_train_STD = pd.DataFrame(data = X_train_STD, columns = X_train.columns)
X_test_STD = pd.DataFrame(data = X_test_STD, columns = X_test.columns)
Some other functions that we will be using later.
def Performance(clf, X_test = X_test_STD):
    """Display accuracy, weighted F1, precision and recall of *clf* on the test set.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` and ``score``.
    X_test : pd.DataFrame, default X_test_STD
        Features to evaluate on; labels come from the notebook-global y_test.
        NOTE(review): the default binds the global at definition time.
    """
    y_pred = clf.predict(X_test)
    # Build the one-row frame directly: DataFrame.append was deprecated in
    # pandas 1.4 and removed in 2.0.
    df = pd.DataFrame([{'Score': clf.score(X_test, y_test),
                        'F1 Score': f1_score(y_test.values, y_pred, average= 'weighted'),
                        'Precision Score': precision_score(y_test.values, y_pred, average= 'weighted'),
                        'Recall Score': recall_score(y_test.values, y_pred, average= 'weighted')}])
    # Styler.hide_index was likewise removed in pandas 2.0; hide(axis='index')
    # is the pandas >= 1.4 replacement.
    display(df.style.hide(axis='index'))
def highlight_max(s):
    """Return a CSS style per cell, colouring every cell equal to the column maximum."""
    peak = s.max()
    return ['background-color: SpringGreen' if v == peak else '' for v in s]
def Feature_Ranking(clf):
    """Run RFE for every subset size n = 2 .. n_features-1 and report test accuracy.

    Uses the notebook-global X / X_train_STD / y_train / X_test_STD / y_test
    splits. Displays a per-n table (best score highlighted via highlight_max)
    and returns the 'Features' list of the highest-scoring run (first one on ties).
    """
    # Accumulate rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and per-row appends were quadratic anyway.
    rows = []
    for n in range(2, X.shape[1]):
        selector = RFE(estimator= clf, n_features_to_select=n, verbose=0)
        selector.fit(X_train_STD, y_train)
        rows.append({'Number of Features to Select': n,
                     'Score': metrics.accuracy_score(y_test, selector.predict(X_test_STD)),
                     'Features': X.columns[selector.support_].tolist(),
                     'Best Features': X.columns[selector.ranking_ == 1].tolist()})
    df = pd.DataFrame(rows, columns=['Number of Features to Select', 'Score',
                                     'Features', 'Best Features'])
    display(df.style.apply(highlight_max, subset=['Score']))
    return df.loc[df.Score == df.Score.max(), 'Features'].values[0]
def ROC_Curve(clf, X_test = X_test_STD):
    """Plot the ROC curve of *clf* on (X_test, y_test), with the AUC in the legend."""
    # Probability of the positive class drives the curve.
    positive_scores = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, positive_scores)
    fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
    ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
    # Chance diagonal for reference.
    ax.plot([0, 1], [0, 1],'r--', lw=2)
    ax.legend(loc = 'lower right', fontsize = 14)
    ax.set_xlim([0,1])
    ax.set_xlabel('False Positive Rate (FPR)')
    ax.set_ylabel('True Positive Rate (TPR)')
The first classifier that we use here is Decision Tree Classifier.
# Baseline: decision tree on all standardized features.
# NOTE(review): no random_state is set, so results vary between runs -- confirm.
dtc = DecisionTreeClassifier()
_ = dtc.fit(X_train_STD,y_train)
Performance(dtc)
ROC_Curve(dtc)
However, we could also use RFE from sklearn.feature_selection. This provides feature ranking with recursive feature elimination.
# Recursive feature elimination driven by the tree's feature importances.
Best_Features = Feature_Ranking(dtc)
Thus, we could use fewer features and improve the results of the classification. The best features for the classification are
print(Best_Features)
# Refit the tree on the RFE-selected feature subset only.
dtc = DecisionTreeClassifier()
_ = dtc.fit(X_train_STD[Best_Features],y_train)
Performance(dtc, X_test_STD[Best_Features])
ROC_Curve(dtc, X_test_STD[Best_Features])
RFE can be very useful, especially in cases where the number of features is large.
# Random forest on all standardized features.
# NOTE(review): no random_state is set, so results vary between runs -- confirm.
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD,y_train)
Performance(rfc)
ROC_Curve(rfc)
# RFE ranking, this time using the forest as the estimator.
Best_Features = Feature_Ranking(rfc)
Thus, we could use fewer features and improve the results of the classification. The best features for the classification are
print(Best_Features)
# Refit the forest on the RFE-selected feature subset only.
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD[Best_Features],y_train)
Performance(rfc, X_test_STD[Best_Features])
ROC_Curve(rfc, X_test_STD[Best_Features])
It can be seen overall, Random Forest Classifier performed better in this example. Furthermore, using RFE improves the accuracy of this classification.